library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(readr)
TRANSIT <- read_csv("TRANSIT.csv")
## Rows: 283 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (1): TRANSIT
## date (1): DATE
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Transit_Raw <- TRANSIT$TRANSIT
# Plot and Inference
Transit_ts <- ts(Transit_Raw, frequency = 12, start = c(2000,1))
plot(Transit_ts)

# The series displays consistent seasonal variation and stable ridership before 2020, followed by a possibly sharp COVID-related collapse and a slow, and recovery afterward.
# Central Tendency
summary(Transit_ts)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 171450 772502 824766 779435 866164 993437
boxplot(Transit_ts)

# The summary statistics indicate that transit ridership was generally high and stable, with most monthly values clustered between roughly 1st quartile and 3rd quartile, and a median of 824,766. The mean is slightly lower than the median, suggesting a slight left-skew influenced by lower values.
# The box plot shows generally stable pre-2020 ridership levels with a tight central range, alongside many low-value outliers caused by the COVID-related collapse in transit usage.
# Decomposition
decomp <- stl(Transit_ts,s.window ="periodic")
plot(decomp)

attributes(decomp)
## $names
## [1] "time.series" "weights" "call" "win" "deg"
## [6] "jump" "inner" "outer"
##
## $class
## [1] "stl"
# The decomposition clearly shows repeating cycles each year, with peaks and troughs occurring at roughly the same months.That means the series is seasonal, exhibiting a strong annual pattern.
# The decomposition is additive, because the seasonal component has a constant magnitude over time.
# Monthly Indices
monthly_indices <- seasonally_adjusted <- decomp$time.series[, "seasonal"]
monthly_indices[1:12]
## [1] -28344.4773 -44993.0331 32372.4450 -3295.0362 15894.3269 -10193.2025
## [7] -21748.6764 -220.9989 22783.8112 70049.5800 -2440.5805 -29864.1587
# The series shows its peak in October and its lowest level in February.
# Ridership is likely higher in October due to stable commuting patterns, mild weather, and active work and school schedules. In contrast, the cold winter conditions and reduced travel in February contribute to lower ridership levels.
# Seasonality Adjustment
Transit_adj <- seasadj(decomp)
plot(Transit_ts)
lines(Transit_adj, col="Red")

# Seasonality has a noticeable impact on the series, with monthly ridership fluctuating by roughly 30,000 to 70,000 riders depending on the time of year.
# Naive Method
naive_model <- naive(Transit_ts)
plot(naive_model$residuals)

# The residuals are mostly centered around zero with no strong recurring pattern, suggesting the model fits the data reasonably well under normal conditions. However, the large spike in residuals around 2020 indicates that the model fails to capture the abrupt COVID-related collapse in ridership.
hist(naive_model$residuals)

# The residuals are mostly clustered around zero, but the histogram is left-skewed due to several very large negative residuals from the COVID-19 collapse. This indicates that while the model fits normal periods reasonably well, it does not handle the sudden structural break in 2020.
# fitted values vs residuals
plot(naive_model$fitted, naive_model$residuals)
abline(h = 0, col = "red")

# The residuals are mostly centered around zero, but the presence of several large negative outliers indicates that the model fails to account for the abrupt COVID-related decline. This suggests the model fits typical periods reasonably well but does not capture structural shocks.
# actual values vs residuals
plot(Transit_ts, naive_model$residuals)
abline(h = 0, col = "red")

# The plot shows that most residuals cluster around zero, indicating that the model fits the majority of the data well. However, there are several large negative residuals when actual ridership is very low, reflecting the pandemic collapse that the model was unable to predict.
# ACF of Residuals
Acf(naive_model$residuals)

# The ACF plot shows significant autocorrelation at multiple lags, especially at lag 12, indicating that the residuals are not random. This means the naive model does not adequately capture the seasonal structure of the time series.
# Accuracy
accuracy(naive_model)
## ME RMSE MAE MPE MAPE MASE
## Training set -683.4326 55803.6 41483.85 -0.6755225 6.063339 0.7298063
## ACF1
## Training set -0.1325041
#Forecast
naive_forecast <- naive(Transit_ts,12)
plot(naive_forecast)

summary(naive_forecast)
##
## Forecast method: Naive method
##
## Model Information:
## Call: naive(y = Transit_ts, h = 12)
##
## Residual sd: 55803.6048
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -683.4326 55803.6 41483.85 -0.6755225 6.063339 0.7298063
## ACF1
## Training set -0.1325041
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Aug 2023 532206 460690.8 603721.2 422832.9 641579.1
## Sep 2023 532206 431068.2 633343.8 377529.1 686882.9
## Oct 2023 532206 408338.0 656074.0 342766.3 721645.7
## Nov 2023 532206 389175.6 675236.4 313459.9 750952.1
## Dec 2023 532206 372293.2 692118.8 287640.4 776771.6
## Jan 2024 532206 357030.3 707381.7 264297.8 800114.2
## Feb 2024 532206 342994.6 721417.4 242832.1 821579.9
## Mar 2024 532206 329930.5 734481.5 222852.3 841559.7
## Apr 2024 532206 317660.4 746751.6 204086.8 860325.2
## May 2024 532206 306055.1 758356.9 186338.0 878074.0
## Jun 2024 532206 295016.9 769395.1 169456.6 894955.4
## Jul 2024 532206 284470.1 779941.9 153326.6 911085.4
# The naive forecasting model has moderate accuracy, with a MAPE of about 6%, meaning its predictions are generally close during normal periods but do not capture sudden changes like COVID. Naive model is simple and works for stable patterns, but it does not account for trend or seasonality.
# Simple Moving Averages
attributes(Transit_ts)
## $tsp
## [1] 2000.0 2023.5 12.0
##
## $class
## [1] "ts"
frequency(Transit_ts)
## [1] 12
start(Transit_ts)
## [1] 2000 1
end(Transit_ts)
## [1] 2023 7
plot(Transit_ts)

MA3 <- ma(Transit_ts,order=3)
plot(MA3)

MA6 <- ma(Transit_ts,order=6)
plot(MA6)

MA9 <- ma(Transit_ts,order=9)
plot(MA9)

plot(Transit_ts)
lines(Transit_ts)
lines(MA3, col = "red", lwd = 2)
lines(MA6, col = "blue", lwd = 2)
lines(MA9, col = "green", lwd = 2)

# forecast for the next 12 month
MA3_forecast <- forecast(MA3, h = 12)
## Warning in ets(object, lambda = lambda, biasadj = biasadj,
## allow.multiplicative.trend = allow.multiplicative.trend, : Missing values
## encountered. Using longest contiguous portion of time series
MA6_forecast <- forecast(MA6, h = 12)
## Warning in ets(object, lambda = lambda, biasadj = biasadj,
## allow.multiplicative.trend = allow.multiplicative.trend, : Missing values
## encountered. Using longest contiguous portion of time series
MA9_forecast <- forecast(MA9, h = 12)
## Warning in ets(object, lambda = lambda, biasadj = biasadj,
## allow.multiplicative.trend = allow.multiplicative.trend, : Missing values
## encountered. Using longest contiguous portion of time series
summary(MA9_forecast)
##
## Forecast method: ETS(A,Ad,N)
##
## Model Information:
## ETS(A,Ad,N)
##
## Call:
## ets(y = object, lambda = lambda, biasadj = biasadj, allow.multiplicative.trend = allow.multiplicative.trend)
##
## Smoothing parameters:
## alpha = 0.9999
## beta = 0.9999
## phi = 0.8
##
## Initial states:
## l = 790233.3054
## b = 2301.1594
##
## sigma: 8771.733
##
## AIC AICc BIC
## 6545.175 6545.489 6566.876
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -195.8352 8691.624 6867.204 0.00766116 0.9437753 0.1484077
## ACF1
## Training set 0.06643575
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2023 548293.2 537051.7 559534.6 531100.9 565485.4
## May 2023 544672.6 521526.8 567818.3 509274.2 580070.9
## Jun 2023 541776.1 505888.4 577663.8 486890.5 596661.6
## Jul 2023 539458.9 490582.2 588335.6 464708.4 614209.4
## Aug 2023 537605.2 475825.9 599384.4 443122.0 632088.4
## Sep 2023 536122.2 461715.7 610528.7 422327.2 649917.1
## Oct 2023 534935.8 448281.0 621590.5 402408.7 667462.8
## Nov 2023 533986.6 435514.7 632458.6 383386.8 684586.5
## Dec 2023 533227.3 423389.4 643065.2 365244.8 701209.9
## Jan 2024 532619.9 411867.7 653372.1 347945.3 717294.5
## Feb 2024 532133.9 400907.4 663360.4 331440.3 732827.5
## Mar 2024 531745.2 390465.9 673024.4 315677.2 747813.1
accuracy(MA9_forecast)
## ME RMSE MAE MPE MAPE MASE
## Training set -195.8352 8691.624 6867.204 0.00766116 0.9437753 0.1484077
## ACF1
## Training set 0.06643575
# As the moving average order increases from 3 to 9, the plot becomes smoother and less noisy, but also lags behind the actual data, so it better for long-term trend identification but less effective for short-term forecasting.
# Simple Smoothing
SSE_forecast <- ses(Transit_ts,h=12)
SSE_forecast
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Aug 2023 539578.3 468678.2 610478.5 431145.9 648010.7
## Sep 2023 539578.3 447103.9 632052.8 398150.9 681005.8
## Oct 2023 539578.3 429686.5 649470.1 371513.3 707643.4
## Nov 2023 539578.3 414674.8 664481.9 348554.8 730601.9
## Dec 2023 539578.3 401283.0 677873.6 328073.9 751082.8
## Jan 2024 539578.3 389078.2 690078.4 309408.3 769748.4
## Feb 2024 539578.3 377791.5 701365.1 292146.7 787009.9
## Mar 2024 539578.3 367242.4 711914.2 276013.3 803143.4
## Apr 2024 539578.3 357302.8 721853.8 260812.0 818344.6
## May 2024 539578.3 347877.9 731278.7 246397.9 832758.8
## Jun 2024 539578.3 338895.2 740261.5 232659.9 846496.8
## Jul 2024 539578.3 330297.6 748859.1 219511.1 859645.6
summary(SSE_forecast)
##
## Forecast method: Simple exponential smoothing
##
## Model Information:
## Simple exponential smoothing
##
## Call:
## ses(y = Transit_ts, h = 12)
##
## Smoothing parameters:
## alpha = 0.8374
##
## Initial states:
## l = 732001.6666
##
## sigma: 55323.68
##
## AIC AICc BIC
## 7782.916 7783.002 7793.852
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -812.0035 55127.84 40313.36 -0.8363806 6.026904 0.7092143
## ACF1
## Training set 0.01607982
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Aug 2023 539578.3 468678.2 610478.5 431145.9 648010.7
## Sep 2023 539578.3 447103.9 632052.8 398150.9 681005.8
## Oct 2023 539578.3 429686.5 649470.1 371513.3 707643.4
## Nov 2023 539578.3 414674.8 664481.9 348554.8 730601.9
## Dec 2023 539578.3 401283.0 677873.6 328073.9 751082.8
## Jan 2024 539578.3 389078.2 690078.4 309408.3 769748.4
## Feb 2024 539578.3 377791.5 701365.1 292146.7 787009.9
## Mar 2024 539578.3 367242.4 711914.2 276013.3 803143.4
## Apr 2024 539578.3 357302.8 721853.8 260812.0 818344.6
## May 2024 539578.3 347877.9 731278.7 246397.9 832758.8
## Jun 2024 539578.3 338895.2 740261.5 232659.9 846496.8
## Jul 2024 539578.3 330297.6 748859.1 219511.1 859645.6
# As the showing result, the smoothing parameter alpha is approximately 0.8374, meaning the model assigns 84% weight to the most recent observation, resulting in a model that responds quickly to recent changes. The initial level 732,001.67 represents the starting smoothed estimate of the series. The sigma value of approximately 55,324 reflects the variability of the residuals, suggesting moderate forecast error but generally stable performance.
# Residual Analysis
plot(SSE_forecast$residuals)

# The residual plot shows no consistent trend or repeating pattern, indicating that the SES model has captured the overall level of the series fairly well. However, there is a noticeable large negative residual around 2020, which reflects the sudden drop in ridership during pandemic that the model could not anticipate.
hist(SSE_forecast$residuals)

# The histogram shows most residuals centered near zero, but a long left tail caused by the COVID-related drop indicates the model could not capture the sudden structural change.
# fitted values vs residuals
plot(SSE_forecast$fitted, SSE_forecast$residuals)
abline(h = 0, col = "red")

# The fitted-vs-residuals plot for the SES model shows mostly random scatter around zero, indicating a better fit than the naive and moving average models, though large negative residuals during the COVID drop remain unaccounted for.
# actual values vs residuals
plot(Transit_ts, SSE_forecast$residuals)
abline(h = 0, col = "red")

# The plot shows that most residuals are centered around zero, indicating a reasonable fit, but the large negative residuals when actual values are low show that the model still fails to capture the sudden structural change in the series.
# ACF of Residuals
Acf(SSE_forecast$residuals)

# The ACF plot of the SES residuals shows fewer significant autocorrelations than the naive and moving average models, indicating that SES captures the underlying level better; however, the spike around lag 12 shows remaining seasonality that the model does not fully remove.
# Accuracy
accuracy(SSE_forecast)
## ME RMSE MAE MPE MAPE MASE
## Training set -812.0035 55127.84 40313.36 -0.8363806 6.026904 0.7092143
## ACF1
## Training set 0.01607982
# The SES model shows better accuracy than the naive and moving average models (lower RMSE and MAE), meaning it fits the data more closely. It forecasts that ridership in one year will stay close to the current level, since it weights recent data more heavily. However, it still does not fully capture seasonality or the sharp pandemic drop.
# Holt-Winters
HW <- HoltWinters(Transit_ts)
plot(HW)

HW_forecast <- forecast(Transit_ts,h=12)
plot(HW_forecast)

summary(HW_forecast)
##
## Forecast method: ETS(A,N,A)
##
## Model Information:
## ETS(A,N,A)
##
## Call:
## ets(y = object, lambda = lambda, biasadj = biasadj, allow.multiplicative.trend = allow.multiplicative.trend)
##
## Smoothing parameters:
## alpha = 0.9999
## gamma = 1e-04
##
## Initial states:
## l = 805081.3493
## s = -29928.9 -2604.664 70008.64 22793.36 -189.8851 -20385.34
## -10941.57 16412.36 -1776.894 31546.4 -45757.8 -29175.71
##
## sigma: 41374.59
##
## AIC AICc BIC
## 7630.122 7631.920 7684.804
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -892.2577 40338.21 25687.09 -0.5431477 4.095672 0.4519012
## ACF1
## Training set 0.009813671
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Aug 2023 552406.3 499382.7 605430.0 471313.6 633499.0
## Sep 2023 575389.4 500406.4 650372.4 460712.8 690065.9
## Oct 2023 622604.8 530771.4 714438.3 482157.7 763052.0
## Nov 2023 549991.7 443952.5 656031.0 387818.7 712164.8
## Dec 2023 522667.4 404112.5 641222.3 341353.3 703981.5
## Jan 2024 523420.1 393550.2 653290.1 324801.2 722039.1
## Feb 2024 506838.9 366563.7 647114.1 292306.4 721371.3
## Mar 2024 584143.3 434183.1 734103.6 354798.9 813487.7
## Apr 2024 550814.2 391757.5 709870.8 307558.0 794070.3
## May 2024 569011.4 401351.2 736671.7 312597.2 825425.6
## Jun 2024 541658.0 365814.6 717501.3 272728.7 810587.2
## Jul 2024 532206.0 348542.2 715869.8 251316.5 813095.6
# The Holt-Winters model estimated alpha = 0.9999, meaning it gives almost full weight to the most recent data and adjusts the level very quickly. There is no beta term because the model includes no trend component. Gamma = 0.0001 indicates that the seasonal pattern is very stable over time. The initial level (= 805,081) represents the baseline ridership, and the seasonal values show which months are above or below average. Sigma = 41,375 indicates the typical size of forecast errors, reflecting a relatively good model fit compared to simpler methods.
# Residual Analysis
plot(HW_forecast$residuals)

# Residuals are mostly random around zero, showing Holt-Winters fits better than naïve, moving average, and SES, except during the pandemic drop.
hist(HW_forecast$residuals)

# The histogram is centered near zero with a smaller spread, indicating improved accuracy, though a left tail remains due to pandemic.
# fitted values vs residuals
plot(HW_forecast$fitted, HW_forecast$residuals)
abline(h = 0, col = "red")

# Residuals show no pattern and are tighter than in the naive, moving average, and SES models, indicating a better fit.
# actual values vs residuals
plot(Transit_ts, HW_forecast$residuals)
abline(h = 0, col = "red")

# Most residuals are small, with Holt-Winters reducing error more effectively than the other models, except during the pandemic shock.
# ACF of Residuals
Acf(HW_forecast$residuals)

# The ACF plot of the SES residuals shows fewer significant autocorrelations than the naive and moving average models, indicating that SES captures the underlying level better; however, the spike around lag 12 shows remaining seasonality that the model does not fully remove.
# Accuracy
accuracy(HW_forecast)
## ME RMSE MAE MPE MAPE MASE
## Training set -892.2577 40338.21 25687.09 -0.5431477 4.095672 0.4519012
## ACF1
## Training set 0.009813671
# The ACF of residuals shows little to no significant autocorrelation, indicating the model has successfully captured both level and seasonality. This is a clear improvement compared to naïve and moving average models, which showed strong autocorrelation, and a modest improvement over SES, which struggled with seasonal structure.
# Accuracy
accuracy(HW_forecast)
## ME RMSE MAE MPE MAPE MASE
## Training set -892.2577 40338.21 25687.09 -0.5431477 4.095672 0.4519012
## ACF1
## Training set 0.009813671
# The Holt-Winters model shows the best accuracy among the naïve, moving average, and SES models, with the lowest RMSE and MAE, meaning it fits the data more closely. It predicts that ridership next year will remain near the current post-COVID levels, while still reflecting the seasonal ups and downs.
# Accuracy Summary
accuracy(naive_forecast)
## ME RMSE MAE MPE MAPE MASE
## Training set -683.4326 55803.6 41483.85 -0.6755225 6.063339 0.7298063
## ACF1
## Training set -0.1325041
accuracy(MA3_forecast)
## ME RMSE MAE MPE MAPE MASE
## Training set -474.3732 21447.51 9418.979 -0.06588614 1.651357 0.1843854
## ACF1
## Training set 0.5855302
accuracy(MA6_forecast)
## ME RMSE MAE MPE MAPE MASE
## Training set -90.14435 7087.279 4032.781 0.05631012 0.631004 0.08383187
## ACF1
## Training set 0.4079666
accuracy(MA9_forecast)
## ME RMSE MAE MPE MAPE MASE
## Training set -195.8352 8691.624 6867.204 0.00766116 0.9437753 0.1484077
## ACF1
## Training set 0.06643575
accuracy(SSE_forecast)
## ME RMSE MAE MPE MAPE MASE
## Training set -812.0035 55127.84 40313.36 -0.8363806 6.026904 0.7092143
## ACF1
## Training set 0.01607982
accuracy(HW_forecast)
## ME RMSE MAE MPE MAPE MASE
## Training set -892.2577 40338.21 25687.09 -0.5431477 4.095672 0.4519012
## ACF1
## Training set 0.009813671
acc_Naive <- accuracy(naive_forecast)
acc_MA3 <- accuracy(MA3_forecast)
acc_MA6 <- accuracy(MA6_forecast)
acc_MA9 <- accuracy(MA9_forecast)
acc_SES <- accuracy(SSE_forecast)
acc_HW <- accuracy(HW_forecast)
accuracy_table <- rbind("Naive" = acc_Naive, "MA3" = acc_MA3, "MA6" = acc_MA6, "MA9" = acc_MA9, "SES" = acc_SES, "HW" = acc_HW)
accuracy_table <- rbind(
"Naive" = acc_Naive["Training set", ],
"MA3" = acc_MA3["Training set", ],
"MA6" = acc_MA6["Training set", ],
"MA9" = acc_MA9["Training set", ],
"SES" = acc_SES["Training set", ],
"HW" = acc_HW["Training set", ]
)
round(accuracy_table, 4)
## ME RMSE MAE MPE MAPE MASE ACF1
## Naive -683.4326 55803.605 41483.851 -0.6755 6.0633 0.7298 -0.1325
## MA3 -474.3732 21447.510 9418.979 -0.0659 1.6514 0.1844 0.5855
## MA6 -90.1444 7087.279 4032.781 0.0563 0.6310 0.0838 0.4080
## MA9 -195.8352 8691.624 6867.204 0.0077 0.9438 0.1484 0.0664
## SES -812.0035 55127.840 40313.357 -0.8364 6.0269 0.7092 0.0161
## HW -892.2577 40338.207 25687.094 -0.5431 4.0957 0.4519 0.0098
# I chose RMSE as the accuracy measure because it penalizes large forecast errors more heavily, which is important for this dataset since it contains a large shock. Based on RMSE, the best model is MA6 (RMSE = 7,087), and the worst model is SES (RMSE = 55,128) among the methods tested. MA6 performs best because its smoothing window balances noise reduction and responsiveness, while SES performs worst because it overweights recent observations and cannot handle the large structural change.
# Conclusion
# The transit ridership series shows stable and strong seasonal patterns before 2020, followed by a sharp drop during the COVID-19 period and a gradual recovery afterward. This indicates that while the long-term demand trend was steady, the pandemic created a temporary structural break in the series. Based on the best-performing forecast model, ridership is expected to continue increasing slowly over the next year as recovery progresses. Over the next two years, the series is likely to move closer to pre-pandemic levels, but may not fully return to the previous peak.